Prerequisites

Load required packages

library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)

Dataset

Import processed data, which can be found here.

#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')

Get sample of dataset

#set seed value to birthday of Ricardo Rodriguez, American wrestler and ring announcer and Dr. Reinaldo (Rei) Sanchez-Arias
set.seed(19630217)

#set percentage to test with for simplicity, if needed
percentage <- 5
wine_sample<- sample_n(wines, percentage/100*nrow(wines))

Split Taster data into different Data Frame

tasters <- wines %>%
  select(taster_name, taster_twitter_handle) %>% unique()
tasters

Drop taster_twitter_handle in wines dataframe

wines <- wines %>%
  select(-taster_twitter_handle)
head(wines)

Add Reviewer profile info

Each reviewer has their own bias. To offset that, we made a “profile” for each reviewer which includes characteristics like: avg_points, sd_points, and var_points

taster_rating_profile <- wines %>%
  group_by(taster_name) %>%
  summarize(
    avg_points = mean(points),
    sd_points = sd(points),
    var_points = var(points),
    reviews = n()
  )

tasters <- inner_join(tasters, taster_rating_profile, by = "taster_name")
head(tasters)

Add Rating Classification

Add following classification to wine dataset as found on the website:

Category Rating Description
Classic 98-100 The pinnacle of quality.
Superb 94-97 A great achievement.
Excellent 90-93 Highly recommended.
Very Good 87-89 Often good value; well recommended.
Good 83-86 Suitable for everyday consumption; often good value.
Acceptable 80-82 Can be employed in casual, less-critical circumstances
# Map review scores to the rating category from the table above.
#
# Vectorized: `points` may be a single score or a whole column, so the
# function can be used directly inside mutate() without rowwise().
#
# points: numeric vector of review scores.
# Returns a character vector the same length as `points`; a missing score
# gives NA instead of raising an error.
rating_category <- function(points) {
  # Lower bound of every category above "Acceptable", in increasing order.
  lower_bounds <- c(83, 87, 90, 94, 98)
  labels <- c(
    "Acceptable", "Good", "Very Good", "Excellent", "Superb", "Classic"
  )
  # findInterval() counts how many lower bounds each score has reached
  # (intervals are closed on the left); that count indexes the label.
  labels[findInterval(points, lower_bounds) + 1L]
}

# Tag every wine with its rating category. rowwise() calls the classifier
# one score at a time; ungroup() afterwards so the row-wise state does not
# leak into later steps (a row-wise summarize() returns one row per wine
# instead of a single overall row).
wines <- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points)) %>%
  ungroup()
head(wines)

Add Adjusted Points

Since each reviewer has a different bias, we created a normalized metric, norm_points, by looking at the number of standard deviations a wine is from the reviewer’s avg_points. This gives us a more accurate representation of which wines are better than the rest.

# Express each wine's score as the number of standard deviations it sits
# from its reviewer's average score.
#
# data: data frame with `taster_name` and `points` columns.
# Returns `data` with a `norm_points` column added; the reviewer profile
# columns pulled in by the join are dropped again. The profiles are read
# from the global `tasters` data frame.
normalize_points <- function(data) {
  data %>%
    # Clear any row-wise/grouped state so later verbs work on plain rows.
    ungroup() %>%
    left_join(tasters, by = "taster_name") %>%
    # Column arithmetic is already vectorized, so rowwise() is not needed.
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(
      -avg_points, -sd_points, -var_points, -taster_twitter_handle, -reviews
    )
}

wines <- normalize_points(wines)
head(wines)

Data Sanitation

Vintage seems to have year 7200

wines <- wines %>%
  filter(vintage<2019)

Data Exploration

Univariate Exploration

Correlation price by points, using DataExplorer library which can be found here

# TODO: IZZY

Alcohol Amount

# TODO: IZZY

Category

# TODO: IZZY

Vintage

Count wines per year (Note: Data has been sanitized)

wines %>%
  group_by(vintage) %>%
  summarize(count = n())
Grouping rowwise data frame strips rowwise nature
# Bar chart of the number of wines per vintage, with the x axis starting at
# 1900 (NA keeps the upper limit computed from the data). ggplot2's helper
# is xlim(); x_lim() does not exist.
wines %>%
  ggplot() +
  geom_bar(mapping = aes(x = vintage)) +
  xlim(1900, NA)

Winery

# TODO: Osaki

Province

# TODO: OSAKI

Price

# Overall price statistics. ungroup() first: if `wines` is still row-wise
# (or grouped) from an earlier step, summarize() returns one row per
# wine/group instead of a single overall row.
wines %>%
  ungroup() %>%
  summarize(
    avg_price = mean(price, na.rm = TRUE),
    sd_price = sd(price, na.rm = TRUE),
    lowest_price = min(price, na.rm = TRUE),
    highest_price = max(price, na.rm = TRUE)
  )

Points

# Overall score statistics. ungroup() first: if `wines` is still row-wise
# (or grouped) from an earlier step, summarize() returns one row per
# wine/group instead of a single overall row.
wines %>%
  ungroup() %>%
  summarize(
    avg_points = mean(points, na.rm = TRUE),
    sd_points = sd(points, na.rm = TRUE),
    lowest_points = min(points, na.rm = TRUE),
    highest_points = max(points, na.rm = TRUE)
  )

Points distribution by Reviewer

wines %>%
  ggplot() +
  geom_boxplot(aes(y=taster_name, x=points)) +
  geom_vline(xintercept = mean(wines$points))

Multivariate Exploration

Price by Points

Notice the data is “stacked” and the scores range from 80-100

wines %>% 
  ggplot() +
  geom_point(mapping = (aes(x = points, y = price)), na.rm = T, alpha = 0.15) +
  labs(title = "Price by Points", x = "Points", y = "Price")

TODO: IZZY (Why did we log this?)

wines %>% 
  ggplot() +
  geom_point(mapping = (aes(x = points, y = log(price))), na.rm = T, alpha = 0.15) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")

Data Analysis

#Find the best province for wine using the average points across the 1,000 samples #drop the descriptions or just select price? set points to max(points)

best_province <- wine_sample %>% 
  group_by(province, points) %>% 
  filter(points > 88.669)
best_province  

Best wine, by variety

#wine_best_variety <- 
wines %>% 
  group_by(variety) %>% 
  summarise(mean_points = mean(points)) %>% 
  arrange(desc(mean_points)) 
  
# Ask for a budget and list the highest-rated wines within it.
# readline() returns "" in non-interactive runs (e.g. when the notebook is
# knitted), and as.integer() would truncate "12.50" to 12, so the answer is
# parsed as a number and falls back to no price cap when it is unusable.
user_price <- readline(
  prompt = "How much are you willing to spend on a bottle?"
)
user_price <- suppressWarnings(as.numeric(user_price))
if (is.na(user_price)) {
  message("No valid budget entered; showing wines at every price.")
  user_price <- Inf
}

wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)

Conclusion

---
title: "Exploring and Analyzing Wine Enthusiast Reviews"
output: html_notebook
---

# Prerequisites

Load required packages
```{r, message=FALSE, warning=FALSE}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
```

# Dataset

Import processed data, which can be found [here](https://github.com/C4rbyn3m4n/wine_reviews_data_analysis/blob/master/data/processed_data/preprocessing.rmd).

```{r}
# Load the preprocessed wine reviews.
wines <- read.csv("../data/processed_data/wines.csv")
```

Get sample of dataset
```{r}
# Seed the random number generator so the sample is reproducible. The seed
# is the birthday of Ricardo Rodriguez (American wrestler and ring
# announcer) and Dr. Reinaldo (Rei) Sanchez-Arias.
set.seed(19630217)

# Share of the data, in percent, to keep in the smaller working sample.
percentage <- 5
wine_sample <- sample_n(tbl = wines, size = percentage / 100 * nrow(wines))
```

### Split Taster data into different Data Frame

```{r}
# One row per distinct reviewer name / Twitter handle pair.
tasters <- unique(select(wines, taster_name, taster_twitter_handle))
tasters
```

Drop `taster_twitter_handle` in wines dataframe

```{r}
# The handle is kept in `tasters`, so remove it from the review data.
wines <- select(wines, -taster_twitter_handle)
head(wines)
```
## Add Reviewer profile info

Each reviewer has their own bias. To offset that, we made a "profile" for each reviewer which includes characteristics like: `avg_points`, `sd_points`, and `var_points`
```{r}
# Per-reviewer scoring profile: mean, spread and number of reviews.
taster_rating_profile <- summarise(
  group_by(wines, taster_name),
  avg_points = mean(points),
  sd_points = sd(points),
  var_points = var(points),
  reviews = n()
)

# Attach the profile to the matching reviewer rows.
tasters <- tasters %>%
  inner_join(taster_rating_profile, by = "taster_name")
head(tasters)
```
### Add Rating Classification

Add following classification to wine dataset as found on the [website](https://www.winemag.com/2010/04/09/you-asked-how-is-a-wines-score-determined/):

|Category  | Rating  | Description                                            |
|----------|---------|--------------------------------------------------------|
|Classic   |	98-100 | The pinnacle of quality.                               |
|Superb    |	94-97	 | A great achievement.                                   |
|Excellent |	90-93	 | Highly recommended.                                    |
|Very Good |  87-89	 | Often good value; well recommended.                    |
|Good	     |  83-86	 | Suitable for everyday consumption; often good value.   |
|Acceptable|	80-82	 | Can be employed in casual, less-critical circumstances |

```{r}
# Map review scores to the rating category from the table above.
#
# Vectorized: `points` may be a single score or a whole column, so the
# function can be used directly inside mutate() without rowwise().
#
# points: numeric vector of review scores.
# Returns a character vector the same length as `points`; a missing score
# gives NA instead of raising an error.
rating_category <- function(points) {
  # Lower bound of every category above "Acceptable", in increasing order.
  lower_bounds <- c(83, 87, 90, 94, 98)
  labels <- c(
    "Acceptable", "Good", "Very Good", "Excellent", "Superb", "Classic"
  )
  # findInterval() counts how many lower bounds each score has reached
  # (intervals are closed on the left); that count indexes the label.
  labels[findInterval(points, lower_bounds) + 1L]
}

# Tag every wine with its rating category. rowwise() calls the classifier
# one score at a time; ungroup() afterwards so the row-wise state does not
# leak into later steps (a row-wise summarize() returns one row per wine
# instead of a single overall row).
wines <- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points)) %>%
  ungroup()
head(wines)
```

## Add Adjusted Points

Since each reviewer has a different bias, we created a normalized metric, `norm_points`, by looking at the number of standard deviations a wine is from the reviewer's `avg_points`. This gives us a more accurate representation of which wines are better than the rest.

```{r}
# Express each wine's score as the number of standard deviations it sits
# from its reviewer's average score.
#
# data: data frame with `taster_name` and `points` columns.
# Returns `data` with a `norm_points` column added; the reviewer profile
# columns pulled in by the join are dropped again. The profiles are read
# from the global `tasters` data frame.
normalize_points <- function(data) {
  data %>%
    # Clear any row-wise/grouped state so later verbs work on plain rows.
    ungroup() %>%
    left_join(tasters, by = "taster_name") %>%
    # Column arithmetic is already vectorized, so rowwise() is not needed.
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(
      -avg_points, -sd_points, -var_points, -taster_twitter_handle, -reviews
    )
}

wines <- normalize_points(wines)
head(wines)
```

## Data Sanitation

Vintage seems to have year 7200
``` {r}
# Remove implausible vintages (e.g. the year 7200). filter() only keeps rows
# where the test is TRUE, so wines with a missing vintage are dropped too.
wines <- filter(wines, vintage < 2019)
```
# Data Exploration

## Univariate Exploration
Correlation `price` by `points`, using ```DataExplorer``` library which can be found [here](https://datascienceplus.com/blazing-fast-eda-in-r-with-dataexplorer/)
```{r}
# TODO: IZZY
```

### Alcohol Amount
```{r}
# TODO: IZZY
```

### Category
```{r}
# TODO: IZZY
```

### Vintage
Count wines per year (Note: Data has been sanitized)
```{r}
# Number of reviewed wines for each vintage year.
summarise(group_by(wines, vintage), count = n())
```


```{r}
# Bar chart of the number of wines per vintage, with the x axis starting at
# 1900 (NA keeps the upper limit computed from the data). The plot
# expression must not end on a dangling `+`, which cannot be parsed.
wines %>%
  ggplot() +
  geom_bar(mapping = aes(x = vintage)) +
  xlim(1900, NA)
```

### Winery
```{r}
# TODO: Osaki
```

### Province
```{r}
# TODO: OSAKI
```

### Price
```{r}
# Overall price statistics. ungroup() first: if `wines` is still row-wise
# (or grouped) from an earlier step, summarize() returns one row per
# wine/group instead of a single overall row.
wines %>%
  ungroup() %>%
  summarize(
    avg_price = mean(price, na.rm = TRUE),
    sd_price = sd(price, na.rm = TRUE),
    lowest_price = min(price, na.rm = TRUE),
    highest_price = max(price, na.rm = TRUE)
  )
```

### Points
```{r}
# Overall score statistics. ungroup() first: if `wines` is still row-wise
# (or grouped) from an earlier step, summarize() returns one row per
# wine/group instead of a single overall row.
wines %>%
  ungroup() %>%
  summarize(
    avg_points = mean(points, na.rm = TRUE),
    sd_points = sd(points, na.rm = TRUE),
    lowest_points = min(points, na.rm = TRUE),
    highest_points = max(points, na.rm = TRUE)
  )
```

Points distribution by Reviewer
```{r}
# One horizontal box per reviewer; the vertical line marks the mean score
# across all reviews.
ggplot(wines) +
  geom_boxplot(aes(x = points, y = taster_name)) +
  geom_vline(xintercept = mean(wines$points))
```

## Multivariate Exploration

## Price by Points
Notice the data is "stacked" and the scores range from 80-100
```{r}
# Price against score; a low alpha keeps overlapping points visible, and
# na.rm drops wines without a price silently.
ggplot(wines) +
  geom_point(aes(x = points, y = price), na.rm = TRUE, alpha = 0.15) +
  labs(title = "Price by Points", x = "Points", y = "Price")
```

TODO: IZZY (Why did we log this?)

```{r}
# Same scatter plot with price on a natural-log scale.
ggplot(wines) +
  geom_point(aes(x = points, y = log(price)), na.rm = TRUE, alpha = 0.15) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
```

# Data Analysis

Find the best province for wine using the average points across the 1,000 samples.
Open notes: drop the descriptions or just select price? Set points to `max(points)`.
```{r}
# Wines from the sample scoring above 88.669, grouped by province and score.
# NOTE(review): this keeps individual wines rather than ranking provinces by
# their average score -- confirm it matches the goal stated above.
best_province <- filter(
  group_by(wine_sample, province, points),
  points > 88.669
)
best_province
```


Best wine, by variety
```{r}
# Average score per grape variety, highest first.
#wine_best_variety <-
arrange(
  summarise(group_by(wines, variety), mean_points = mean(points)),
  desc(mean_points)
)
  
```

```{r}
# Ask for a budget and list the highest-rated wines within it.
# readline() returns "" in non-interactive runs (e.g. when the notebook is
# knitted), and as.integer() would truncate "12.50" to 12, so the answer is
# parsed as a number and falls back to no price cap when it is unusable.
user_price <- readline(
  prompt = "How much are you willing to spend on a bottle?"
)
user_price <- suppressWarnings(as.numeric(user_price))
if (is.na(user_price)) {
  message("No valid budget entered; showing wines at every price.")
  user_price <- Inf
}

wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)
```


# Conclusion
